{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SpaCy Tutorial\n",
    "#### 02 Intermediate spaCy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the medium English model and bind it to `nlp` for the rest of the notebook\n",
    "MODEL_NAME = \"en_core_web_md\"\n",
    "nlp = spacy.load(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word Vectors  \n",
    "All spaCy models except the small one come with word vectors trained with GloVe, an algorithm similar to Word2Vec. spaCy also lets you use other pretrained word vectors or your own custom word vectors, such as ones trained with Gensim, FastText, TensorFlow, etc.  \n",
    "\n",
    "The examples below show how to compute similarity between words/sentences, but you can also feed the vector representation as a feature to downstream tasks, such as a text classifier in your favorite ML framework."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-0.15067   -0.024468  -0.23368   -0.23378   -0.18382    0.32711\n",
      " -0.22084   -0.28777    0.12759    1.1656    -0.64163   -0.098455\n",
      " -0.62397    0.010431  -0.25653    0.31799    0.037779   1.1904\n",
      " -0.17714   -0.2595    -0.31461    0.038825  -0.15713   -0.13484\n",
      "  0.36936   -0.30562   -0.40619   -0.38965    0.3686     0.013963\n",
      " -0.6895     0.004066  -0.1367     0.32564    0.24688   -0.14011\n",
      "  0.53889   -0.80441   -0.1777    -0.12922    0.16303    0.14917\n",
      " -0.068429  -0.33922    0.18495   -0.082544  -0.46892    0.39581\n",
      " -0.13742   -0.35132    0.22223   -0.144     -0.048287   0.3379\n",
      " -0.31916    0.20526    0.098624  -0.23877    0.045338   0.43941\n",
      "  0.030385  -0.013821  -0.093273  -0.18178    0.19438   -0.3782\n",
      "  0.70144    0.16236    0.0059111  0.024898  -0.13613   -0.11425\n",
      " -0.31598   -0.14209    0.028194   0.5419    -0.42413   -0.599\n",
      "  0.24976   -0.27003    0.14964    0.29287   -0.31281    0.16543\n",
      " -0.21045   -0.4408     1.2174     0.51236    0.56209    0.14131\n",
      "  0.092514   0.71396   -0.021051  -0.33704   -0.20275   -0.36181\n",
      "  0.22055   -0.25665    0.28425   -0.16968    0.058029   0.61182\n",
      "  0.31576   -0.079185   0.35538   -0.51236    0.4235    -0.30033\n",
      " -0.22376    0.15223   -0.048292   0.23532    0.46507   -0.67579\n",
      " -0.32905    0.08446   -0.22123   -0.045333   0.34463   -0.1455\n",
      " -0.18047   -0.17887    0.96879   -1.0028    -0.47343    0.28542\n",
      "  0.56382   -0.33211   -0.38275   -0.2749    -0.22955   -0.24265\n",
      " -0.37689    0.24822    0.36941    0.14651   -0.37864    0.31134\n",
      " -0.28449    0.36948   -2.8174    -0.38319   -0.022373   0.56376\n",
      "  0.40131   -0.42131   -0.11311   -0.17317    0.1411    -0.13194\n",
      "  0.18494    0.097692  -0.097341  -0.23987    0.16631   -0.28556\n",
      "  0.0038654  0.53292   -0.32367   -0.38744    0.27011   -0.34181\n",
      " -0.27702   -0.67279   -0.10771   -0.062189  -0.24783   -0.070884\n",
      " -0.20898    0.062404   0.022372   0.13408    0.1305    -0.19546\n",
      " -0.46849    0.77731   -0.043978   0.3827    -0.23376    1.0457\n",
      " -0.14371   -0.3565    -0.080713  -0.31047   -0.57822   -0.28067\n",
      " -0.069678   0.068929  -0.16227   -0.63934   -0.62149    0.11222\n",
      " -0.16969   -0.54637    0.49661    0.46565    0.088294  -0.48496\n",
      "  0.69263   -0.068977  -0.53709    0.20802   -0.42987   -0.11921\n",
      "  0.1174    -0.18443    0.43797   -0.1236     0.3607    -0.19608\n",
      " -0.35366    0.18808   -0.5061     0.14455   -0.024368  -0.10772\n",
      " -0.0115     0.58634   -0.054461   0.0076487 -0.056297   0.27193\n",
      "  0.23096   -0.29296   -0.24325    0.10317   -0.10014    0.7089\n",
      "  0.17402   -0.0037509 -0.46304    0.11806   -0.16457   -0.38609\n",
      "  0.14524    0.098122  -0.12352   -0.1047     0.39047   -0.3063\n",
      " -0.65375   -0.0044248 -0.033876   0.037114  -0.27472    0.0053147\n",
      "  0.30737    0.12528   -0.19527   -0.16461    0.087518  -0.051107\n",
      " -0.16323    0.521      0.10822   -0.060379  -0.71735   -0.064327\n",
      "  0.37043   -0.41054   -0.2728    -0.30217    0.015771  -0.43056\n",
      "  0.35647    0.17188   -0.54598   -0.21541   -0.044889  -0.10597\n",
      " -0.54391    0.53908    0.070938   0.097839   0.097908   0.17805\n",
      "  0.18995    0.49962   -0.18529    0.051234   0.019574   0.24805\n",
      "  0.3144    -0.29304    0.54235    0.46672    0.26017   -0.44705\n",
      "  0.28287   -0.033345  -0.33181   -0.10902   -0.023324   0.2106\n",
      " -0.29633    0.81506    0.038524   0.46004    0.17187   -0.29804  ] Dimension: 300\n"
     ]
    }
   ],
   "source": [
    "# Print the word vector for \"cat\" followed by its dimension size\n",
    "word1 = nlp(\"cat\")\n",
    "cat_vector = word1.vector\n",
    "print(f\"{cat_vector} Dimension: {len(cat_vector)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8016855517329495\n"
     ]
    }
   ],
   "source": [
    "# Cosine similarity between \"cat\" (word1, created above) and \"dog\"\n",
    "word2 = nlp(\"dog\")\n",
    "cat_dog_similarity = word1.similarity(word2)\n",
    "print(cat_dog_similarity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8223489933262161\n"
     ]
    }
   ],
   "source": [
    "# Document-level cosine similarity: each document is compared via its averaged word vectors\n",
    "doc1, doc2 = nlp(\"I like sushi.\"), nlp(\"My favorite food is ramen.\")\n",
    "print(doc1.similarity(doc2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.31461387624735404\n"
     ]
    }
   ],
   "source": [
    "# A pair of unrelated sentences yields a low similarity score\n",
    "doc1 = nlp(\"I like sushi.\")\n",
    "doc2 = nlp(\"Jupyter notebook installation guide\")\n",
    "unrelated_similarity = doc1.similarity(doc2)\n",
    "print(unrelated_similarity)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with a Big Dataset  \n",
    "This Twitter data was downloaded from [Kaggle](https://www.kaggle.com/c/twitter-sentiment-analysis2)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 3)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the zipped CSV and keep a 1,000-row sample; the fixed random_state makes it repeatable\n",
    "df = pd.read_csv(\n",
    "    'data/train.csv.zip', sep=',', compression='zip', encoding='latin_1'\n",
    ").sample(1000, random_state=1111)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ItemID</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>SentimentText</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1266</th>\n",
       "      <td>1267</td>\n",
       "      <td>1</td>\n",
       "      <td>Awesome. &amp;lt;3 TEDDY! &amp;lt;3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39898</th>\n",
       "      <td>39910</td>\n",
       "      <td>1</td>\n",
       "      <td>@Andre_R its been my mission to find other sou...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88879</th>\n",
       "      <td>88891</td>\n",
       "      <td>1</td>\n",
       "      <td>@Azura999 Told you</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3688</th>\n",
       "      <td>3689</td>\n",
       "      <td>1</td>\n",
       "      <td>thanks for the birthday DMs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68437</th>\n",
       "      <td>68449</td>\n",
       "      <td>0</td>\n",
       "      <td>@bryandl i know!  you should come visit again!!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28556</th>\n",
       "      <td>28568</td>\n",
       "      <td>0</td>\n",
       "      <td>@airlanggatwerp bagi link nya dong nce huhu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48974</th>\n",
       "      <td>48986</td>\n",
       "      <td>1</td>\n",
       "      <td>@AriaParadiso @ChelseaParadiso nighty night u ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68698</th>\n",
       "      <td>68710</td>\n",
       "      <td>0</td>\n",
       "      <td>@BSBSavedMyLife it won't play</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90592</th>\n",
       "      <td>90604</td>\n",
       "      <td>0</td>\n",
       "      <td>@chiniehdiaz Im out of it..havent had any in d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55359</th>\n",
       "      <td>55371</td>\n",
       "      <td>1</td>\n",
       "      <td>@barrymoltz Ahhh, yes-- shiny object syndrome....</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       ItemID  Sentiment                                      SentimentText\n",
       "1266     1267          1                        Awesome. &lt;3 TEDDY! &lt;3\n",
       "39898   39910          1  @Andre_R its been my mission to find other sou...\n",
       "88879   88891          1                                @Azura999 Told you \n",
       "3688     3689          1                        thanks for the birthday DMs\n",
       "68437   68449          0    @bryandl i know!  you should come visit again!!\n",
       "28556   28568          0       @airlanggatwerp bagi link nya dong nce huhu \n",
       "48974   48986          1  @AriaParadiso @ChelseaParadiso nighty night u ...\n",
       "68698   68710          0                     @BSBSavedMyLife it won't play \n",
       "90592   90604          0  @chiniehdiaz Im out of it..havent had any in d...\n",
       "55359   55371          1  @barrymoltz Ahhh, yes-- shiny object syndrome...."
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Typical way (slow)\n",
    "Single-threaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(text:str=None):\n",
    "    \"\"\"Run ``text`` through ``nlp`` and return its tokens as plain strings.\n",
    "\n",
    "    Args:\n",
    "        text: Raw text to tokenize. ``None`` (the default) yields an empty\n",
    "            list instead of being handed to ``nlp``.\n",
    "\n",
    "    Returns:\n",
    "        A list with the ``text`` attribute of every token, in document order.\n",
    "    \"\"\"\n",
    "    if text is None:\n",
    "        return []\n",
    "\n",
    "    # nlp(text) runs every enabled pipeline component, not only the tokenizer\n",
    "    return [token.text for token in nlp(text)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.89 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Baseline: call tokenize() on one tweet at a time.\n",
    "# Series.apply hands each text straight to tokenize, so no per-row object is built just to read one column.\n",
    "df['token_list1'] = df['SentimentText'].apply(tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ItemID</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>SentimentText</th>\n",
       "      <th>token_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1266</th>\n",
       "      <td>1267</td>\n",
       "      <td>1</td>\n",
       "      <td>Awesome. &amp;lt;3 TEDDY! &amp;lt;3</td>\n",
       "      <td>[ , Awesome, ., &amp;, lt;3, TEDDY, !, &amp;, lt;3]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39898</th>\n",
       "      <td>39910</td>\n",
       "      <td>1</td>\n",
       "      <td>@Andre_R its been my mission to find other sou...</td>\n",
       "      <td>[@Andre_R, its, been, my, mission, to, find, o...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88879</th>\n",
       "      <td>88891</td>\n",
       "      <td>1</td>\n",
       "      <td>@Azura999 Told you</td>\n",
       "      <td>[@Azura999, Told, you]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3688</th>\n",
       "      <td>3689</td>\n",
       "      <td>1</td>\n",
       "      <td>thanks for the birthday DMs</td>\n",
       "      <td>[ , thanks, for, the, birthday, DMs]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68437</th>\n",
       "      <td>68449</td>\n",
       "      <td>0</td>\n",
       "      <td>@bryandl i know!  you should come visit again!!</td>\n",
       "      <td>[@bryandl, i, know, !,  , you, should, come, v...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28556</th>\n",
       "      <td>28568</td>\n",
       "      <td>0</td>\n",
       "      <td>@airlanggatwerp bagi link nya dong nce huhu</td>\n",
       "      <td>[@airlanggatwerp, bagi, link, nya, dong, nce, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48974</th>\n",
       "      <td>48986</td>\n",
       "      <td>1</td>\n",
       "      <td>@AriaParadiso @ChelseaParadiso nighty night u ...</td>\n",
       "      <td>[@AriaParadiso, @ChelseaParadiso, nighty, nigh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68698</th>\n",
       "      <td>68710</td>\n",
       "      <td>0</td>\n",
       "      <td>@BSBSavedMyLife it won't play</td>\n",
       "      <td>[@BSBSavedMyLife, it, wo, n't, play]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90592</th>\n",
       "      <td>90604</td>\n",
       "      <td>0</td>\n",
       "      <td>@chiniehdiaz Im out of it..havent had any in d...</td>\n",
       "      <td>[@chiniehdiaz, I, m, out, of, it, .., havent, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55359</th>\n",
       "      <td>55371</td>\n",
       "      <td>1</td>\n",
       "      <td>@barrymoltz Ahhh, yes-- shiny object syndrome....</td>\n",
       "      <td>[@barrymoltz, Ahhh, ,, yes--, shiny, object, s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       ItemID  Sentiment                                      SentimentText  \\\n",
       "1266     1267          1                        Awesome. &lt;3 TEDDY! &lt;3   \n",
       "39898   39910          1  @Andre_R its been my mission to find other sou...   \n",
       "88879   88891          1                                @Azura999 Told you    \n",
       "3688     3689          1                        thanks for the birthday DMs   \n",
       "68437   68449          0    @bryandl i know!  you should come visit again!!   \n",
       "28556   28568          0       @airlanggatwerp bagi link nya dong nce huhu    \n",
       "48974   48986          1  @AriaParadiso @ChelseaParadiso nighty night u ...   \n",
       "68698   68710          0                     @BSBSavedMyLife it won't play    \n",
       "90592   90604          0  @chiniehdiaz Im out of it..havent had any in d...   \n",
       "55359   55371          1  @barrymoltz Ahhh, yes-- shiny object syndrome....   \n",
       "\n",
       "                                              token_list  \n",
       "1266         [ , Awesome, ., &, lt;3, TEDDY, !, &, lt;3]  \n",
       "39898  [@Andre_R, its, been, my, mission, to, find, o...  \n",
       "88879                             [@Azura999, Told, you]  \n",
       "3688                [ , thanks, for, the, birthday, DMs]  \n",
       "68437  [@bryandl, i, know, !,  , you, should, come, v...  \n",
       "28556  [@airlanggatwerp, bagi, link, nya, dong, nce, ...  \n",
       "48974  [@AriaParadiso, @ChelseaParadiso, nighty, nigh...  \n",
       "68698               [@BSBSavedMyLife, it, wo, n't, play]  \n",
       "90592  [@chiniehdiaz, I, m, out, of, it, .., havent, ...  \n",
       "55359  [@barrymoltz, Ahhh, ,, yes--, shiny, object, s...  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SpaCy way (fast)  \n",
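    "spaCy speeds up bulk processing with `nlp.pipe`, which treats the texts as a stream, processes them in batches, and yields Doc objects. (Real multi-processing is available through the `n_process` argument in spaCy v2.2.2+.)"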
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.1 s ± 86.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# nlp.pipe streams the texts through the pipeline in batches and yields one Doc per text.\n",
    "# `n_threads` is intentionally omitted: it is deprecated and ignored since spaCy v2.1\n",
    "# and is not accepted by spaCy v3 (use `n_process` there for parallelism).\n",
    "texts = df.SentimentText.astype('unicode').values\n",
    "token_list = [\n",
    "    [token.text for token in doc]\n",
    "    for doc in nlp.pipe(texts, batch_size=100)\n",
    "]\n",
    "\n",
    "df['token_list2'] = token_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SpaCy way (faster)  \n",
    "When an `nlp` object is created, spaCy sets up a processing pipeline made of several components (here: tagger, parser, and NER). Disabling the components you don't need makes spaCy even faster! The pipeline can also be customized.  \n",
    "\n",
    "![nlp_pipeline](img/nlp_pipeline.png)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fb268b9fda0>)\n",
      "('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fb268ab3ca8>)\n",
      "('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fb268ab3d08>)\n"
     ]
    }
   ],
   "source": [
    "# Show every entry of the processing pipeline, one per line\n",
    "for pipeline_entry in nlp.pipeline:\n",
    "    print(pipeline_entry)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "139 ms ± 461 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Disable the POS tagger, dependency parser, and NER since all we want is the tokenizer.\n",
    "# Alternatively, you can use the nlp.make_doc method, which skips all pipeline components, if you just need a tokenizer.\n",
    "# `n_threads` is intentionally omitted: it is deprecated and ignored since spaCy v2.1\n",
    "# and is not accepted by spaCy v3 (use `n_process` there for parallelism).\n",
    "texts = df.SentimentText.astype('unicode').values\n",
    "\n",
    "with nlp.disable_pipes('tagger', 'parser', 'ner'):\n",
    "    # Build the list inside the block so every Doc is produced while the components are disabled\n",
    "    token_list = [\n",
    "        [token.text for token in doc]\n",
    "        for doc in nlp.pipe(texts, batch_size=100)\n",
    "    ]\n",
    "\n",
    "df['token_list3'] = token_list"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
